Part 1: PCA with penguins

# Wrangling: select the variables, get rid of NAs, then scale the data so the variables are on similar scales and are weighted evenly.

# PCA on the penguin size measurements:
#   1. keep body mass plus every column whose name ends with "_mm"
#      (ends_with() is a selection helper),
#   2. drop rows with missing values,
#   3. center and scale the values so each variable is weighted evenly,
#   4. run the principal components analysis.
penguin_pca <- penguins %>%
  select(body_mass_g, ends_with("_mm")) %>%
  drop_na() %>%
  scale(center = TRUE, scale = TRUE) %>%
  prcomp()

# Typing `penguin_pca$` lets you see which components are stored in the PCA result,
# e.g. penguin_pca$rotation

# Companion data set for the PCA biplot: the penguins table with rows dropped
# only where one of the PCA inputs (body mass or a "_mm" measurement) is
# missing. It therefore holds the same observations as the PCA while keeping
# the variables that were not part of the PCA, which can be used to change
# the aesthetics of the graph.
# The "_mm" suffix is written exactly as in the PCA column selection so both
# steps are guaranteed to work on the same set of columns.
penguin_complete <- penguins %>%
  drop_na(body_mass_g, ends_with("_mm"))

# PCA biplot. autoplot() recognizes the type of object it is given and assumes
# the type of graph we want: here, the location of every observation in
# multivariate space, plotted on PC1 and PC2.
#   - data / colour: penguin_complete holds the same observations as the PCA
#     plus the extra variables, so the points can be coloured by species.
#   - loadings / loadings.label: draw the loading arrows for the variables
#     (useful when looking for correlations) and label them.
# Per the notes for this lab, the PCA biplot captures 90% of the variance in
# the data.
autoplot(
  penguin_pca,
  data = penguin_complete,
  colour = "species",
  loadings = TRUE,
  loadings.label = TRUE
) +
  theme_minimal()
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Part 2: ggplot2 customization & reading file types

Read in an .xlsx file & do some wrangling

#mutate is to transform columns

# Read the landings workbook and wrangle it:
#   - clean_names() tidies up the column names,
#   - every character column is converted to lower case,
#   - str_sub(end = -4) keeps each nmfs_name up to its fourth-from-last
#     character, i.e. it removes the last three characters,
#   - only the rows whose confidentiality is "public" are kept.
fish_noaa <- here("data", "foss_landings.xlsx") %>%
  read_excel() %>%
  clean_names() %>%
  mutate(
    across(where(is.character), tolower),
    nmfs_name = str_sub(nmfs_name, end = -4)
  ) %>%
  filter(confidentiality == "public")

Make a customized graph:

# One line per nmfs_name over the years. The legend is hidden because there
# are so many species that it would crowd out the graph.
fish_plot <- ggplot(fish_noaa, aes(x = year, y = pounds)) +
  geom_line(mapping = aes(colour = nmfs_name), show.legend = FALSE) +
  theme_minimal()

# Display the static plot.
fish_plot
## Warning: Removed 6 row(s) containing missing values (geom_path).

# Turn the static ggplot into an interactive graph.
ggplotly(p = fish_plot)
### Use gghighlight to highlight certain series

# Highlight one series ("tunas") on the graph; lines are grouped by nmfs_name.
# gghighlight reports that the grouped calculation failed for this predicate
# and falls back to an ungrouped filter, presumably because the comparison
# yields one value per row rather than one per series.
# NOTE(review): gghighlight(..., use_group_by = FALSE) would presumably avoid
# that warning; confirm before changing.
fish_plot <- ggplot(fish_noaa, aes(x = year, y = pounds, group = nmfs_name)) +
  geom_line() +
  theme_minimal() +
  gghighlight(nmfs_name == "tunas")
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...
## label_key: nmfs_name
# Highlight every series whose maximum `pounds` value is greater than 1e8.
# Adding colour in geom_line() together with gghighlight() only colours the
# series that are highlighted.
# na.rm = TRUE: the data contain missing values, and without it max() returns
# NA for any series with a missing record, so that series could never be
# highlighted even when its landings exceed the threshold.
fish_plot <- ggplot(
  data = fish_noaa,
  aes(x = year, y = pounds, group = nmfs_name)
) +
  geom_line(aes(color = nmfs_name)) +
  theme_minimal() +
  gghighlight(max(pounds, na.rm = TRUE) > 1e8)
## label_key: nmfs_name

Read in from a URL, then use lubridate and mutate() to make a graph with months in logical order

# Read the daily electricity CSV for the water plant straight from its URL,
# then tidy up the column names.
monroe_wt <- read_csv(
  file = "https://data.bloomington.in.gov/dataset/2c81cfe3-62c2-46ed-8fcf-83c1880301d1/resource/13c8f7aa-af51-4008-80a9-56415c7c931e/download/mwtpdailyelectricitybclear.csv"
) %>%
  clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   date = col_character(),
##   kWh1 = col_double(),
##   kW1 = col_double(),
##   kWh2 = col_double(),
##   kW2 = col_double(),
##   solar_kWh = col_double(),
##   total_kWh = col_double(),
##   MG = col_double()
## )
# Create a graph that shows, by month, how much total energy this water plant uses.
# 1. R needs to understand that the date column is a date.
# 2. Add a new column with the month abbreviation, using the built-in month.abb vector.
# 3. Convert the month name to a factor whose order is based on the month number associated with it, so the months are in logical order.

# Prepare the time-series columns in one mutate() (later entries can use the
# columns created by earlier ones):
#   - date: parsed from month-day-year text into a proper date (YYYY-MM-DD),
#   - record_month: just the month number pulled from that date,
#   - month_name: the three-letter abbreviation (1 -> Jan, 4 -> Apr) stored as
#     a factor ordered by record_month, so the months run in logical order.
monroe_ts <- monroe_wt %>%
  mutate(
    date = mdy(date),
    record_month = month(date),
    month_name = fct_reorder(month.abb[record_month], record_month)
  )

# Jittered points of total_k_wh for each month.
ggplot(monroe_ts, aes(x = month_name, y = total_k_wh)) +
  geom_jitter()

Part 3: Compound Figures with patchwork

Compound figures: multiple graphs that are part of one figure, arranged in a chosen layout

# Scatter plot of flipper length against body mass.
graph_a <- penguins %>%
  ggplot(aes(x = body_mass_g, y = flipper_length_mm)) +
  geom_point()

# Jittered flipper lengths per species, coloured by species. The legend is
# hidden.
graph_b <- penguins %>%
  ggplot(aes(x = species, y = flipper_length_mm)) +
  geom_jitter(mapping = aes(colour = species), show.legend = FALSE)

#Use | to put graphs side by side 
# Use / sign to put graphs one over the other 

# graph_a and graph_b side by side, fish_plot underneath; `&` applies
# theme_dark() to the plots in the layout.
((graph_a | graph_b) / fish_plot) & theme_dark()
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 6 row(s) containing missing values (geom_path).

# Store the compound figure (top row: graph_a | graph_b, bottom row:
# fish_plot, dark theme), then display it.
graph_c <- ((graph_a | graph_b) / fish_plot) & theme_dark()

graph_c
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 6 row(s) containing missing values (geom_path).

# Save the compound figure to fig/graph_c_ah.png (5 x 6 in ggsave()'s default
# units). The plot is passed explicitly so the file always contains graph_c
# rather than whichever plot happened to be displayed last, which is what
# ggsave() uses when `plot` is omitted.
ggsave(
  filename = here("fig", "graph_c_ah.png"),
  plot = graph_c,
  width = 5,
  height = 6
)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 6 row(s) containing missing values (geom_path).